library(data.table)
library(lubridate)
library(extrafont)
library(ggplot2)
library(stargazer)
library(tidyr)
library(stringr)
library(xtable)
require(beepr)
require(VennDiagram)
# ---- Project configuration ----
# PRELIM and PAPER_DIR are blank in this copy of the script. PAPER_DIR is both
# the working directory and the root under which every PATH entry is resolved.
PRELIM <- ''
PAPER_DIR <- ''
setwd(PAPER_DIR)
set.seed(42)
options(scipen = 999)  # strongly favour fixed over scientific notation

# Named lookup of every input/output location, keyed by a short label.
PATH <- local({
  relative <- c(
    RData             = 'Data/RSessions/overcompliance.RData',
    Figures           = 'Figures/current_figs',
    Latex             = 'Latex/current_tex',
    google_patents    = 'Data/patents/',
    patent            = 'Data/patent.tsv',
    patent_application = 'Data/application.tsv',
    patent_uspc       = 'Data/uspc.tsv',
    patent_assignee   = 'Data/patent_assignee.tsv',
    patent_lawyer     = 'Data/patent_lawyer.tsv',
    patent_inventor   = 'Data/patent_inventor.tsv',
    location          = 'Data/location.tsv',
    location_inventor = 'Data/location_inventor.tsv',
    assignee          = 'Data/assignee.tsv',
    rejections_uspto  = 'Data/rejections_uspto.csv',
    rejections_kuhn   = 'Data/rejections_kuhn.csv',
    source_timing     = 'Data/Kuhn-Younge-Marco_Patent_Citation_Source_and_Timing_2017-09-25.csv',
    Cites_pat         = 'Data/uspatentcitation.tsv',
    Cites_pub         = 'Data/usapplicationcitation.tsv'
  )
  setNames(file.path(PAPER_DIR, relative), names(relative))
})

# ---- Plot defaults shared by every figure ----
windowsFonts(Palatino = windowsFont("Palatino Linotype"))
GG_x_range_date <- as.Date(c("1980-01-01", "2014-12-31"))
GG_font <- "Palatino"
GG_size <- 3
GG_dpi <- 600
GG_theme <- local({
  # Elements that are reused for more than one theme slot.
  label_text <- element_text(size = 8, family = GG_font, colour = "black")
  black_line <- element_line(colour = "black")
  theme(
    axis.text = label_text,
    axis.title = label_text,
    axis.ticks = black_line,
    plot.title = element_text(size = 8, family = GG_font, colour = "black", hjust = 0.5),
    axis.line = black_line,
    legend.key = element_blank(),
    legend.background = element_rect(fill = 'transparent'),
    legend.text = element_text(size = 6),
    legend.position = c(.4, .3),
    axis.line.x = element_line(),
    axis.line.y = element_line()
  )
})

# ---- Load raw inputs ----
# Each table is read from the location registered under its label in PATH.
# Four of the files are read with quoting disabled (quote = "").

# Patent-level tables.
patents            <- fread(PATH[['patent']], quote = "")
patent_application <- fread(PATH[['patent_application']])
patent_uspc        <- fread(PATH[['patent_uspc']])
patent_assignee    <- fread(PATH[['patent_assignee']], quote = "")
patent_lawyer      <- fread(PATH[['patent_lawyer']])
patent_inventor    <- fread(PATH[['patent_inventor']])

# Location and assignee lookups.
location           <- fread(PATH[['location']], quote = "")
location_inventor  <- fread(PATH[['location_inventor']])
assignee           <- fread(PATH[['assignee']], quote = "")

# Citation tables (patent-to-patent and patent-to-published-application).
cites_pat          <- fread(PATH[['Cites_pat']])
cites_pub          <- fread(PATH[['Cites_pub']])

# Rejection data and citation source/timing data.
rejections_kuhn    <- fread(PATH[['rejections_kuhn']])
rejections_uspto   <- fread(PATH[['rejections_uspto']])
cite_source_timing <- fread(PATH[['source_timing']])

# Patents.
# Build one row per utility patent with application data, primary class,
# assignee counts/flags, lawyer flag and examination time, plus the helper
# tables patent_assignee, patent_state and patent_assg_count.

# Numeric keys. Identifiers that do not parse as integers become NA and are
# dropped by the !is.na(pat_num) filters below.
patents[, pat_num := as.integer(id)]
patent_application[, pat_num := as.integer(patent_id)]
patent_application[, app_num := as.integer(number)]
patent_uspc[, pat_num := as.integer(patent_id)]
patent_assignee[, pat_num := as.integer(patent_id)]
patent_lawyer[, pat_num := as.integer(patent_id)]
patent_inventor[, pat_num := as.integer(patent_id)]

# Grant and filing dates, with their calendar years.
patents[, pat_date := ymd(date)]
patents[, pat_year := year(pat_date)]
patent_application[, app_date := ymd(date)]
patent_application[, app_year := year(app_date)]

# Assignee flags derived from the assignee `type` code (3 -> is_foreign,
# 2 -> is_domestic); a missing type counts as neither.
assignee[, is_foreign := ifelse(type == 3, 1, 0)]
assignee[, is_domestic := ifelse(type == 2, 1, 0)]
assignee[is.na(is_foreign), is_foreign := 0]
assignee[is.na(is_domestic), is_domestic := 0]

# Keep only usable rows and the columns needed downstream.
patents <- patents[ !is.na(pat_num) & !is.na(pat_date) & type == 'utility', c('pat_num', 'num_claims', 'pat_date', 'pat_year')]
patent_application <- patent_application[!is.na(pat_num) & !is.na(app_date), c('pat_num', 'app_num', 'app_date', 'app_year')]
patent_uspc <- patent_uspc[ !is.na(pat_num) & sequence == 0 & subclass_id != 'No longer published', c('pat_num', 'mainclass_id', 'subclass_id')]
patent_assignee <- patent_assignee[ !is.na(pat_num), c('pat_num', 'assignee_id')]
patent_lawyer <- patent_lawyer[ !is.na(pat_num),]
patent_inventor <- patent_inventor[ !is.na(pat_num), c('pat_num', 'inventor_id')]
location <- location[ country == 'US', c('id', 'state')]
assignee <- assignee[, c('id', 'is_foreign', 'is_domestic')]

# Number of assignee rows per patent.
patent_assg_count <- patent_assignee[, list(assg_count = .N), by = 'pat_num']

# Restrict locations to the listed two-letter state/territory codes.
location <- location[state %in% c('AK', 'AL', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 
 'FL', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 
 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 
 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 
 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UM', 'UT', 
 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'), ]

# Assemble the patent table: application data is required (inner join),
# class and assignee count are optional (left joins).
patents <- merge(patents, patent_application, by = 'pat_num')
patents <- merge(patents, patent_uspc, by = 'pat_num', all.x = TRUE)
patents <- merge(patents, patent_assg_count, by = 'pat_num', all.x = TRUE)
patent_assignee <- merge(patent_assignee, assignee, by.x = 'assignee_id', by.y = 'id')

# Patent -> inventor -> location -> state.
patent_state <- merge(patent_inventor, location_inventor, by.x = 'inventor_id', by.y = 'inventor_id', allow.cartesian = TRUE)
patent_state <- merge(patent_state, location, by.x = 'location_id', by.y = 'id')[, c('pat_num', 'state')]

# Derived patent-level variables.
patents[, exam_time := interval(app_date, pat_date)/years(1)]  # years from filing to grant
patents[, is_lawyer := ifelse(pat_num %in% patent_lawyer$pat_num, 1, 0)]
patents[, is_foreign := ifelse(pat_num %in% patent_assignee[is_foreign==1, ]$pat_num, 1, 0)]
patents[, is_domestic := ifelse(pat_num %in% patent_assignee[is_domestic==1,]$pat_num, 1, 0)]
patents[is.na(assg_count), assg_count := 0]

# An unassigned merge(patents, patent_assignee[...]) used to sit here. Its
# result was discarded, so it only cost time and memory; it has been removed.

# De-duplicate.
patents <- unique(patents, by = 'pat_num')
patent_assignee <- unique(patent_assignee, by = c('pat_num', 'assignee_id'))
patent_inventor <- unique(patent_inventor)
patent_state <- unique(patent_state)
location_inventor <- unique(location_inventor)

# Applications.
# Reads every file in the google_patents directory, stacks them, derives
# continuation / tech-center variables, normalises doc_id into a numeric
# patent or publication number, and attaches the results to `patents`.
apps <- rbindlist(lapply(as.list(file.path(PATH['google_patents'], list.files(PATH['google_patents']))), fread))
# Parse the three date columns; unparseable values become NA.
apps[, priority_date := ymd(priority_date)]
apps[, app_date := ymd(filing_date)]
apps[, pat_date := ymd(grant_date)]
apps[, is_pat := ifelse(!is.na(pat_date), 1, 0)] # NOTE: This is the neat division.
apps[interval(priority_date, app_date) < 0, priority_date := app_date] # Fixes a few bad priority dates.
apps[, is_continuation := ifelse(interval(priority_date, app_date)/years(1) > 1.01, 1, 0)] # Identify continuations.
# Art units that are not exactly four characters are blanked; the tech center
# is the first two characters of the art unit followed by "00".
apps[!(nchar(art_unit) == 4) , art_unit := NA]
apps[!is.na(art_unit) , tech_center := paste0(substr(art_unit, 1, 2), "00")]
# NOTE(review): TECH_CENTERS is not defined in the visible part of this
# script - confirm it is provided before this point.
apps[!(tech_center %in% TECH_CENTERS), tech_center := NA]
# tech_center was just blanked for non-members, so this also clears art_unit
# on those same rows.
apps[!(tech_center %in% TECH_CENTERS), art_unit := NA]
apps <- apps[nchar(doc_id) >= 12 & (is_pat == 1 | nchar(doc_id) == 16), ] # Drops 4,000. All are some weird thing (e.g., plant, reissue.)
apps <- apps[!is.na(app_date) & !is.na(priority_date), ] # Drops 750,000. All are patents granted before 1975.
# Strip the 'US-' prefix, then split on '-' and discard the trailing part.
apps[ , doc_id := gsub('US-', '', doc_id)]
apps <- separate(apps, doc_id, c('doc_id', 'a'), sep = '-')
apps[ , 'a' := NULL, ]
# For non-patent rows, rebuild doc_id as its first four characters followed by
# the remainder left-padded with zeros to seven characters.
apps[is_pat == 0, doc_id := paste0(substr(doc_id, 1, 4), str_pad(substr(doc_id, 5, 25), width = 7, side = "left", pad = "0"))]
apps[ , doc_id := as.numeric(doc_id)]
apps <- apps[!is.na(doc_id), ] # Drops 24,000 reissues.
# Sort by the key, then keep the first row per (app_num, is_pat).
setkey(apps, 'app_num', 'doc_id', 'is_pat')
apps <- unique(apps, by = c('app_num', 'is_pat')) # Drops a few thousand duplicate rows.
# Build app_num -> pat_num and app_num -> pub_num lookups from doc_id and
# attach both to every row of the same application.
app_doc_cor <- apps[, c('app_num', 'doc_id', 'is_pat')]
pat_merge <- app_doc_cor[app_doc_cor$is_pat == 1, c('app_num', 'doc_id')]
pub_merge <- app_doc_cor[app_doc_cor$is_pat == 0, c('app_num', 'doc_id')]
colnames(pat_merge) <- c('app_num', 'pat_num')
colnames(pub_merge) <- c('app_num', 'pub_num')
apps <- merge(apps, pat_merge, by = 'app_num', all.x = TRUE)
apps <- merge(apps, pub_merge, by = 'app_num', all.x = TRUE)
# Keep the patent row where one exists, otherwise the rows with no patent number.
apps <- apps[is_pat == 1 | is.na(pat_num), c('pat_num', 'pub_num', 'is_continuation', 'tech_center', 'priority_date', 'family_id')]
# Attach application-level variables to granted patents (left join).
patents <- merge(patents, apps[!is.na(pat_num), c('pat_num', 'is_continuation', 'tech_center', 'priority_date', 'family_id')], by = 'pat_num', all.x = TRUE)
# family_id values below 10 are treated as missing; family_size is the number
# of patents sharing a family_id and is NA when the family is unknown.
patents[!is.na(family_id) & family_id < 10, family_id := NA]
patents[ , family_size := .N, by = 'family_id']
patents[is.na(family_id) , family_size := NA]

# Citations
# Combine patent-to-patent and patent-to-publication citations into a single
# `cites` table with one row per (citing patent, cited patent, cited publication).

# Examiner flag: 1 for 'cited by examiner', 0 for 'cited by applicant' or
# 'cited by other', NA for any other category.
cites_pat[, is_exam_cite_local := ifelse(category == 'cited by examiner', 1,
                                         ifelse(category %in% c('cited by applicant', 'cited by other'), 0, NA))]
cites_pub[, is_exam_cite_local := ifelse(category == 'cited by examiner', 1,
                                         ifelse(category %in% c('cited by applicant', 'cited by other'), 0, NA))]

# Keep and rename the three columns used downstream.
cites_pat <- cites_pat[, list(citing_pat_num = patent_id,
                              cited_pat_num = citation_id,
                              is_exam_cite_local)]
cites_pub <- cites_pub[, list(citing_pat_num = patent_id,
                              cited_pub_num = application_id,
                              is_exam_cite_local)]

# Publication ids must be 16 characters long; characters 6-16 are the number.
cites_pub <- cites_pub[nchar(cited_pub_num) == 16, ]
cites_pub[, `:=`(cited_pub_num = as.numeric(substr(cited_pub_num, 6, 16)),
                 citing_pat_num = as.numeric(citing_pat_num))]
cites_pat[, `:=`(cited_pat_num = as.numeric(cited_pat_num),
                 citing_pat_num = as.numeric(citing_pat_num))]

# Drop rows whose identifiers did not parse as numbers.
cites_pub <- cites_pub[!is.na(citing_pat_num) & !is.na(cited_pub_num), ]
cites_pat <- cites_pat[!is.na(citing_pat_num) & !is.na(cited_pat_num), ]

# Patent <-> publication correspondence, used to give each citation both ids
# when they are known.
pat_merge <- apps[!is.na(pat_num) & !is.na(pub_num),
                  list(cited_pat_num = pat_num, cited_pub_num = pub_num)]
cites_pat <- merge(cites_pat, pat_merge, by = 'cited_pat_num', all.x = TRUE)
cites_pub <- merge(cites_pub, pat_merge, by = 'cited_pub_num', all.x = TRUE)

# Tag the origin (1 = patent citation, 2 = publication citation) and stack.
cites_pat[, cite_type_local := 1]
cites_pub[, cite_type_local := 2]
cites <- rbind(cites_pat, cites_pub)

# Collapse duplicates: a citation counts as examiner-added if any copy is, and
# the summed origin codes identify where the citation appeared.
cites <- cites[, list(is_exam_cite_local = max(is_exam_cite_local, na.rm = TRUE),
                      cite_type = sum(cite_type_local)),
               by = c('citing_pat_num', 'cited_pat_num', 'cited_pub_num')]
# max() over an all-NA group yields -Inf; turn that back into NA.
cites[!is.finite(is_exam_cite_local), is_exam_cite_local := NA]
cites[, cite_type := ifelse(cite_type == 1, 'Pat',
                            ifelse(cite_type == 2, 'Pub',
                                   ifelse(cite_type == 3, 'Both', NA)))]
# Single document id: the patent number when known, else the publication number.
cites[, cited_doc_id := ifelse(is.na(cited_pat_num), cited_pub_num, cited_pat_num)]
# Attach dates and years of the citing and the cited patent. Both joins are
# inner joins, so citations whose citing or cited patent is not in `patents`
# are dropped.
pat_merge1 <- patents[, list(citing_pat_num = pat_num,
                             citing_app_year = app_year,
                             citing_pat_year = pat_year,
                             citing_app_date = app_date,
                             citing_pat_date = pat_date)]
pat_merge2 <- patents[, list(cited_pat_num = pat_num,
                             cited_app_year = app_year,
                             cited_pat_year = pat_year,
                             cited_app_date = app_date,
                             cited_pat_date = pat_date)]
cites <- merge(cites, pat_merge1, by = 'citing_pat_num')
cites <- merge(cites, pat_merge2, by = 'cited_pat_num')

# Attach further characteristics of the citing and the cited patent.
citing_merge <- patents[, list(citing_pat_num = pat_num,
                               citing_is_foreign = is_foreign,
                               citing_exam_time = exam_time,
                               citing_num_claims = num_claims,
                               citing_tech_center = tech_center,
                               citing_mainclass = mainclass_id,
                               citing_subclass = subclass_id,
                               citing_is_lawyer = is_lawyer)]
cited_merge <- patents[, list(cited_pat_num = pat_num,
                              cited_num_claims = num_claims,
                              cited_subclass = subclass_id)]
cites <- merge(cites, citing_merge, by = 'citing_pat_num')
cites <- merge(cites, cited_merge, by = 'cited_pat_num')

# Applicant flag is the complement of the examiner flag (NA stays NA).
cites[, is_appl_cite_local := 1 - is_exam_cite_local]
# Applicant citations per citing patent; no na.rm, so one NA flag makes the
# whole patent's total NA.
cites[, applicant_cites := sum(is_appl_cite_local), by = 'citing_pat_num']
cites[, is_applicant_cites_0 := as.numeric(applicant_cites == 0)]
cites[, is_applicant_cites_100 := as.numeric(applicant_cites >= 100)]

# Years between the cited patent's grant and the citing patent's filing.
cites[, age_diff_year := interval(cited_pat_date, citing_app_date)/years(1)]
cites[, is_same_subclass := as.numeric(citing_subclass == cited_subclass)]

# Temporary row id used to mark self-citations.
cites[, cite_id := seq(1, nrow(cites))]
# Self-citations: a citation is a self-citation when the citing and the cited
# patent share at least one assignee id.
citing_merge <- patent_assignee[, c('pat_num', 'assignee_id')]
cited_merge <- patent_assignee[, c('pat_num', 'assignee_id')]
colnames(citing_merge) <- c('citing_pat_num', 'citing_assg_id')
colnames(cited_merge) <- c('cited_pat_num', 'cited_assg_id')
cites_self <- merge(cites, citing_merge, by = 'citing_pat_num', all.x = TRUE)
cites_self <- merge(cites_self, cited_merge, by = 'cited_pat_num', all.x = TRUE)
cites_self <- cites_self[!is.na(citing_assg_id) & !is.na(cited_assg_id) & citing_assg_id == cited_assg_id, ]
cites[, is_self := ifelse(cite_id %in% cites_self$cite_id, 1, 0)]
cites[, cite_id := NULL]

# Forward citations received by each citing patent, counted within `cites`.
cite_counts <- cites[, list(citing_forward_cites = .N), by = 'cited_pat_num']
cites <- merge(cites, cite_counts, by.x = 'citing_pat_num', by.y = 'cited_pat_num', all.x = TRUE)
# Citing patents that are never cited get zero. The original assigned to a
# new column `forward_cites`, which left the NAs in `citing_forward_cites`.
cites[is.na(citing_forward_cites), citing_forward_cites := 0L]
# Citation source/timing data: rename the six columns by position, drop the
# two that are not used, and add the applicant complement of the USPTO
# examiner flag.
setnames(cite_source_timing,
         c('citing_pat_num', 'cited_pat_num', 'citing_pat_year',
           'cite_submission_date', 'is_exam_cite_bulk', 'is_exam_cite_uspto'))
cite_source_timing[, c('citing_pat_year', 'is_exam_cite_bulk') := NULL]
cite_source_timing[, `:=`(is_appl_cite_uspto = 1 - is_exam_cite_uspto,
                          citing_pat_num = as.numeric(citing_pat_num),
                          cited_pat_num = as.numeric(cited_pat_num))]
# One row per citing/cited pair, then left-join onto the citations.
cite_source_timing <- unique(cite_source_timing, by = c('citing_pat_num', 'cited_pat_num'))
cites <- merge(cites, cite_source_timing, by = c('citing_pat_num', 'cited_pat_num'), all.x = TRUE)

# Same-state flag: 1 when some inventor state of the citing patent equals
# some inventor state of the cited patent.
cites[, cite_id := seq(1, nrow(cites))]
citing_merge <- setnames(copy(patent_state), c('citing_pat_num', 'citing_inventor_state'))
cited_merge <- setnames(copy(patent_state), c('cited_pat_num', 'cited_inventor_state'))
cites_states <- merge(cites[, list(cite_id, citing_pat_num, cited_pat_num)],
                      citing_merge, by = 'citing_pat_num', allow.cartesian = TRUE)
cites_states <- merge(cites_states, cited_merge, by = 'cited_pat_num', allow.cartesian = TRUE)
cites_states <- cites_states[!is.na(citing_inventor_state) &
                               !is.na(cited_inventor_state) &
                               citing_inventor_state == cited_inventor_state, ]
cites[, is_same_state := as.numeric(cite_id %in% cites_states$cite_id)]
# Kuhn rejection data: numeric keys, keep only 102/103 rejections.
rejections_kuhn[, `:=`(citing_pat_num = as.numeric(citing_pat_num),
                       cited_pat_num = as.numeric(cited_pat_num))]
rejections_kuhn <- rejections_kuhn[is_102 == 1 | is_103 == 1,
                                   list(citing_pat_num, cited_pat_num, is_102, is_103)]

# USPTO rejection data: numeric keys plus 102/103 indicators taken from
# action_type; keep only rows with citation_in_oa == 1.
rejections_uspto[, `:=`(cited_doc_id = as.numeric(parsed),
                        citing_app_num = as.numeric(app_id),
                        is_102 = as.numeric(action_type == 102),
                        is_103 = as.numeric(action_type == 103))]
rejections_uspto <- rejections_uspto[citation_in_oa == 1,
                                     list(citing_app_num, cited_doc_id, is_102, is_103)]

# Application number -> granted patent number for the citing side.
pat_merge <- patents[!is.na(app_num), list(citing_app_num = app_num, citing_pat_num = pat_num)]

# Rejections.
rejections_uspto <- merge(rejections_uspto, pat_merge, by = 'citing_app_num')
rejections_uspto[, citing_app_num := NULL]

# Split cited documents at 20,000,000: smaller ids are handled as patent
# numbers, larger ids as publication numbers.
rejections_uspto_pat <- rejections_uspto[cited_doc_id < 20000000, ]
rejections_uspto_pub <- rejections_uspto[cited_doc_id > 20000000, ]
setnames(rejections_uspto_pat, 'cited_doc_id', 'cited_pat_num')
setnames(rejections_uspto_pub, 'cited_doc_id', 'cited_pub_num')

# Translate cited publications into patent numbers via `apps`.
pat_merge <- apps[, list(cited_pat_num = pat_num, cited_pub_num = pub_num)]
rejections_uspto_pub <- merge(rejections_uspto_pub, pat_merge, by = 'cited_pub_num')
rejections_uspto_pub[, cited_pub_num := NULL]
rejections_uspto <- rbind(rejections_uspto_pat, rejections_uspto_pub)
rm(rejections_uspto_pat, rejections_uspto_pub)
rejections_uspto <- rejections_uspto[!is.na(cited_pat_num), ]

# One row per citing/cited pair within the USPTO data, then again after
# stacking with the Kuhn data; an indicator is 1 if any contributing row has it.
rejections_uspto <- rejections_uspto[, list(is_102 = max(is_102), is_103 = max(is_103)),
                                     by = c('citing_pat_num', 'cited_pat_num')]
rejections <- rbind(rejections_uspto, rejections_kuhn)
rejections <- rejections[, list(is_102 = max(is_102), is_103 = max(is_103)),
                         by = c('citing_pat_num', 'cited_pat_num')]

# Attach to citations; pairs without a recorded rejection get 0.
cites <- merge(cites, rejections, by = c('citing_pat_num', 'cited_pat_num'), all.x = TRUE)
cites[is.na(is_102), is_102 := 0]
cites[is.na(is_103), is_103 := 0]

# Calculate the portfolio size
# For each patent: the largest, across its assignees, of the number of patents
# the assignee had been granted before the patent's application year.

# Non-self citations with a known cited patent, citing patent granted by 2014.
cites_sub <- cites[is_self == 0 & !is.na(cited_pat_num) & citing_pat_year <= 2014,
                   list(citing_pat_num, cited_pat_num, citing_pat_year, citing_app_year)]
patents_portfolio <- merge(patents[, list(pat_num, app_year, pat_year)],
                           patent_assignee[, list(pat_num, assignee_id)],
                           by = 'pat_num')

# Per assignee and application year (1980-2014): count of that assignee's
# patents granted strictly before the year. The empty seed table fixes the
# column order and types of the stacked result.
results <- rbindlist(
  c(list(data.table(assignee_id = character(),
                    app_year = numeric(),
                    portfolio_size = numeric())),
    lapply(seq(1980, 2014), function(yr) {
      patents_portfolio[pat_year < yr, ][, list(portfolio_size = .N, app_year = yr), by = 'assignee_id']
    })),
  use.names = TRUE)

# Inner join on assignee and application year, then take the maximum over a
# patent's assignees.
patents_portfolio <- merge(patents_portfolio, results, by = c('assignee_id', 'app_year'))
patents_portfolio <- patents_portfolio[, list(portfolio_size = max(portfolio_size, na.rm = TRUE)),
                                       by = 'pat_num']

# Citing-side and cited-side copies with prefixed column names.
patents_portfolio_citing <- setnames(copy(patents_portfolio),
                                     c('citing_pat_num', 'citing_portfolio_size'))
patents_portfolio_cited <- setnames(copy(patents_portfolio),
                                    c('cited_pat_num', 'cited_portfolio_size'))

# Attach; patents with no match get a portfolio size of 0.
patents <- merge(patents, patents_portfolio, by = 'pat_num', all.x = TRUE)
cites <- merge(cites, patents_portfolio_citing, by = 'citing_pat_num', all.x = TRUE)
cites <- merge(cites, patents_portfolio_cited, by = 'cited_pat_num', all.x = TRUE)
patents[is.na(portfolio_size), portfolio_size := 0]
cites[is.na(citing_portfolio_size), citing_portfolio_size := 0]
cites[is.na(cited_portfolio_size), cited_portfolio_size := 0]

# Firm pool size and minimum citing year.
# For every citation: the earliest grant year in which the citing firm cited
# the same patent, and the number of distinct patents the firm had cited
# before the citing patent's application year.
pat_merge <- patent_assignee[, c( 'pat_num', 'assignee_id')]
colnames(pat_merge) <- c('citing_pat_num', 'citing_assignee_id')
cites_assg <- merge(cites_sub, pat_merge, by = 'citing_pat_num')
cites_assg[ , min_citing_pat_year_firm := min(citing_pat_year), by = c('cited_pat_num', 'citing_assignee_id')]
cites_assg_unique <- unique(cites_assg[, c('citing_assignee_id', 'cited_pat_num', 'min_citing_pat_year_firm')])

# Per firm and application year (1980-2014): number of patents the firm first
# cited strictly before that year. Built in one rbindlist() call rather than
# by growing `results` inside a loop; the empty seed table keeps the original
# column order and types.
results <- rbindlist(
  c(list(data.table(citing_assignee_id = as.character(),
                    citing_app_year = as.numeric(),
                    citing_pool_size_firm = as.numeric())),
    lapply(seq(1980, 2014), function(yr) {
      cites_assg_unique[min_citing_pat_year_firm < yr, ][, list( citing_pool_size_firm = .N,
       citing_app_year = yr), by = 'citing_assignee_id']
    })),
  use.names = TRUE)

cites_assg <- merge(cites_assg, results, by = c('citing_assignee_id', 'citing_app_year'))
# Collapse over assignees: earliest first-citing year, largest pool.
cites_assg <- cites_assg[, list(min_citing_pat_year_firm = min(min_citing_pat_year_firm, na.rm = TRUE),
 citing_pool_size_firm = max(citing_pool_size_firm, na.rm = TRUE)), by = c('citing_pat_num', 'cited_pat_num')]
cites <- merge(cites, cites_assg, by = c('citing_pat_num', 'cited_pat_num'), all.x = TRUE)
# Citations without a firm pool get size 0. The original assigned to an
# unrelated new column `citing_pool_size_all`, which left the NAs in
# `citing_pool_size_firm`.
cites[is.na(citing_pool_size_firm), citing_pool_size_firm := 0]
cites[is.na(min_citing_pat_year_firm), min_citing_pat_year_firm := citing_pat_year]

# Common-inventor pool: for every patent, the number of distinct patents that
# any of its inventors cited in a patent granted before this patent's
# application year.
cites_pool <- merge(cites_sub, patent_inventor, by.x = 'citing_pat_num', by.y = 'pat_num', allow.cartesian = TRUE)
cites_pool <- cites_pool[, list(citing_pat_year = min(citing_pat_year)), c('inventor_id', 'cited_pat_num')]
patents_pool <- patents[pat_year <= 2014, c('pat_num', 'app_year')]
patents_pool <- merge(patents_pool, patent_inventor, by = 'pat_num')
patents_pool <- merge(patents_pool, cites_pool, by = 'inventor_id', allow.cartesian = TRUE)
patents_pool <- patents_pool[app_year > citing_pat_year, c('pat_num', 'cited_pat_num')]
patents_pool <- unique(patents_pool)
patents_pool <- patents_pool[, list(pool_size_common = .N), by = 'pat_num']
colnames(patents_pool) <- c('citing_pat_num', 'citing_pool_size_common')
cites <- merge(cites, patents_pool, by = 'citing_pat_num', all.x = TRUE)
cites[is.na(citing_pool_size_common), citing_pool_size_common := 0]

# Sample.
# For citations made by sample patents, count how often an inventor of the
# citing patent appears on an earlier, non-continuation patent that cited the
# same patent.

# Rebuild the citation subset from the current `cites`.
cites_sub <- cites[is_self == 0 & !is.na(cited_pat_num) & citing_pat_year <= 2014,
                   list(citing_pat_num, cited_pat_num, citing_pat_year, citing_app_year)]

# Sample patents: granted 2002-2014, exactly one assignee, not a continuation,
# and flagged as domestic or foreign.
samp_cites <- patents[pat_year >= 2002 & pat_year <= 2014 &
                        assg_count == 1 &
                        is_continuation == 0 &
                        (is_domestic == 1 | is_foreign == 1),
                      list(pat_num)]
samp_cites <- cites_sub[citing_pat_num %in% samp_cites$pat_num,
                        list(citing_pat_num, cited_pat_num, citing_app_year)]

# One row per (citation, inventor of the citing patent).
samp_cites <- merge(samp_cites, patent_inventor,
                    by.x = 'citing_pat_num', by.y = 'pat_num', allow.cartesian = TRUE)
samp_cites <- samp_cites[, list(citing_pat_num, cited_pat_num, citing_app_year, inventor_id)]

# Every (inventor, cited patent) pair found on a non-continuation patent,
# together with that patent's grant year.
pat_merge <- merge(patent_inventor, cites_sub,
                   by.x = c('pat_num'), by.y = c('citing_pat_num'), allow.cartesian = TRUE)
pat_merge <- pat_merge[pat_num %in% patents[is_continuation == 0, pat_num]]
pat_merge[, citing_app_year := NULL]
setnames(pat_merge, c('prior_pat_num', 'inventor_id', 'cited_pat_num', 'prior_pat_year'))

# Match on inventor and cited patent, keep matches granted before the citing
# application year, and count them per citation.
samp_cites <- merge(samp_cites, pat_merge, by = c('inventor_id', 'cited_pat_num'), allow.cartesian = TRUE)
samp_cites <- samp_cites[prior_pat_year < citing_app_year,
                         list(citing_pat_num, cited_pat_num, inventor_id, prior_pat_num)]
samp_cites <- samp_cites[, list(common_inventor_count = .N), by = c('citing_pat_num', 'cited_pat_num')]
cites <- merge(cites, samp_cites, by = c('citing_pat_num', 'cited_pat_num'), all.x = TRUE)
cites[is.na(common_inventor_count), common_inventor_count := 0]

# Final citation-level variables and construction of the analysis sample.

# A citation counts as a rejection if it has a 102 or a 103 rejection.
cites[, is_rej := pmax(is_102, is_103)]
# Rename the *_local examiner/applicant flags to their final names.
cites[, is_exam_cite := is_exam_cite_local]
cites[, is_appl_cite := is_appl_cite_local]
cites[, c('is_exam_cite_local', 'is_appl_cite_local') := NULL]
# Backward-citation counts per citing patent (all, and applicant-added),
# left-joined onto the patent table.
cite_counts <- cites[, list(back_cite_count_all = .N,
 back_cite_count_appl = sum(is_appl_cite, na.rm = TRUE)), by = 'citing_pat_num']
patents <- merge(patents, cite_counts, by.x = 'pat_num', by.y = 'citing_pat_num', all.x = TRUE)
# Sample patents: granted 2002-2014, exactly one assignee, not a continuation,
# and flagged as domestic or foreign.
sample_patents <- patents[pat_year %between% c(2002, 2014) &
 assg_count == 1 &
 is_continuation == 0 &
 (is_domestic == 1 | is_foreign == 1), ]
sample_patents <- merge(sample_patents, patent_assignee[, c('pat_num', 'assignee_id')], by = 'pat_num')
# Sample citations: made by a sample patent, cited patent known, not a
# self-citation, citing patent has a positive claim count, and the citing
# application year is after the firm's first citing grant year for that patent.
sample_cites <- cites[citing_pat_num %in% sample_patents$pat_num & 
 !is.na(cited_pat_num) &
 is_self == 0 & 
 !is.na(citing_num_claims) & citing_num_claims > 0 &
 citing_app_year > min_citing_pat_year_firm, ]
sample_cites[, is_common := ifelse(common_inventor_count >= 1, 1, 0)]
# All-NA placeholder column; the summary table below uses 'dummy' as the
# variable name of its heading rows.
sample_cites[, dummy := NA]
# String id "<citing>_<cited>"; keep one row per id.
sample_cites[, cite_id := paste(as.character(citing_pat_num), as.character(cited_pat_num), sep = '_')]
sample_cites <- unique(sample_cites, by = 'cite_id')[!is.na(cite_id),]
# Pair every examiner-added sample citation with each applicant-added citation
# (is_exam_cite == 0) made by the same citing patent.
cites_alt <- cites[is_exam_cite == 0, c('citing_pat_num', 'cited_pat_num')]
colnames(cites_alt) <- c('citing_pat_num', 'cited_alt_num')
not_withheld <- sample_cites[is_exam_cite == 1, c('citing_pat_num', 'cited_pat_num', 'cite_id')]
not_withheld <- merge(not_withheld, cites_alt, by = 'citing_pat_num')
# "Believable" examiner citation: flagged as examiner-added here, not
# contradicted by the USPTO flag, and its cite_id not among the not_withheld
# rows with max_sim >= .8.
# NOTE(review): `max_sim` is not a column that the visible code creates on
# `not_withheld` (its columns are citing_pat_num, cited_pat_num, cite_id and
# cited_alt_num). Unless `max_sim` exists in an enclosing environment this
# filter will fail - confirm where it is computed.
sample_cites[, is_exam_cite_believable := ifelse(is_exam_cite == 1 & 
 (is.na(is_exam_cite_uspto) | is_exam_cite_uspto == 1) & 
 !(cite_id %in% not_withheld[max_sim >= .8,]$cite_id), 1, 0)]
sample_cites[, is_appl_cite_believable := 1 - is_exam_cite_believable]

# Tables.
# Table 1: descriptive statistics. One row per entry of `var_name`; rows named
# 'dummy' are section headings and end up with blank statistics. The
# MN_lamp_* / SD_lamp_* columns are hard-coded (presumably the values reported
# by the study being replicated - confirm); the *_repl_* columns are filled
# from `sample_cites` by the loops below.
tab_1_summary <- data.table(var_name = c('citing_app_year', 'cited_app_year', 'citing_pat_year', 'cited_pat_year',
 'dummy', 'is_appl_cite', 'common_inventor_count',
 'dummy', 'citing_is_lawyer', 'citing_is_foreign', 'citing_exam_time', 'citing_num_claims', 'cited_num_claims',
 'dummy', 'citing_pool_size_firm', 'citing_pool_size_common', 
 'dummy', 'is_rej', 'is_appl_cite_believable'),
 Variable = c('Citing application year', 'Cited application year', 'Citing grant year', 'Cited grant year', 
 '\\textit{Replication variables}', '\\hspace{1em}Applicant-added', '\\hspace{1em}Common inventors', 
 '\\textit{Control variables}', '\\hspace{1em}Attorney or agent', '\\hspace{1em}Non-U.S. firm', '\\hspace{1em}Examination time', '\\hspace{1em}Citing claims', '\\hspace{1em}Cited claims',
 '\\textit{Previously cited patents}', '\\hspace{1em}By firm', '\\hspace{1em}By common inventor', 
 '\\textit{Additional variables}', '\\hspace{1em}Rejection (102 or 103)', '\\hspace{1em}Applicant-added (first)'),
 MN_lamp_firm = c(1999.18, 1987.4, 2001.52, 1989.29, NA, 0.67, 1.12, NA, 0.94, 0.28, 2.34, 21.84, 15.73, NA, NA, NA, NA, NA, NA), 
 SD_lamp_firm = c(1.16, 5.85, 0.5, 5.75, NA, 0.47, 3.89, NA, 0.24, 0.45, 1.06, 18.70, 14.09, NA, NA, NA, NA, NA, NA),
 MN_repl_firm_2002 = as.numeric(NA), 
 SD_repl_firm_2002 = as.numeric(NA),
 MN_repl_firm_full = as.numeric(NA), 
 SD_repl_firm_full = as.numeric(NA),
 MN_lamp_comm = c(1999.33, 1987.03, 2001.52, 1989.89, NA, 0.79, 3.52, NA, 0.95, 0.23, 2.19, 24.47, 15.89, NA, NA, NA, NA, NA, NA), 
 SD_lamp_comm = c(1.13, 6.03, 0.5, 5.92, NA, 0.41, 6.26, NA, 0.22, 0.42, 1.01, 24.97, 13.86, NA, NA, NA, NA, NA, NA), 
 MN_repl_comm_2002 = as.numeric(NA), 
 SD_repl_comm_2002 = as.numeric(NA),
 MN_repl_comm_full = as.numeric(NA), 
 SD_repl_comm_full = as.numeric(NA), 
 stringsAsFactors = FALSE)
# Pass 1 (all rows except the last five): numeric mean/SD for the full sample
# ("firm"), the common-inventor subsample ("comm"), and each restricted to
# citing grant year 2002. Means are stored as absolute values.
for (i in seq(1, nrow(tab_1_summary)-5)){
 var_name <- tab_1_summary[i,]$var_name
 print(var_name)
 tab_1_summary[i,]$MN_repl_firm_full <- abs(mean(sample_cites[, get(var_name), ], na.rm = TRUE))
 tab_1_summary[i,]$SD_repl_firm_full <- sd( sample_cites[, get(var_name), ], na.rm = TRUE)
 tab_1_summary[i,]$MN_repl_comm_full <- abs(mean(sample_cites[ common_inventor_count > 0, get(var_name), ], na.rm = TRUE))
 tab_1_summary[i,]$SD_repl_comm_full <- sd( sample_cites[ common_inventor_count > 0, get(var_name), ], na.rm = TRUE)
 
 tab_1_summary[i,]$MN_repl_firm_2002 <- abs(mean(sample_cites[citing_pat_year == 2002, get(var_name), ], na.rm = TRUE))
 tab_1_summary[i,]$SD_repl_firm_2002 <- sd( sample_cites[citing_pat_year == 2002, get(var_name), ], na.rm = TRUE)
 tab_1_summary[i,]$MN_repl_comm_2002 <- abs(mean(sample_cites[citing_pat_year == 2002 & common_inventor_count > 0, get(var_name), ], na.rm = TRUE))
 tab_1_summary[i,]$SD_repl_comm_2002 <- sd( sample_cites[citing_pat_year == 2002 & common_inventor_count > 0, get(var_name), ], na.rm = TRUE)
}
# Convert the whole table to formatted strings; the passes below therefore
# write pre-formatted text into the cells.
# NOTE(review): `nnormal` is not a named parameter of base format();
# possibly `nsmall` was intended - confirm before changing, as it would alter
# the printed table.
tab_1_summary <- data.table(format(tab_1_summary, digits = 2, big.mark = ',', nnormal = 2))
# Pass 2 (fifth- and fourth-from-last rows, the two pool-size variables):
# same statistics, formatted with digits = 0.
for(i in seq(nrow(tab_1_summary)-4, nrow(tab_1_summary)-3)){
 var_name <- tab_1_summary[i,]$var_name
 tab_1_summary[i,]$MN_repl_firm_full <- format(abs(mean(sample_cites[, get(var_name), ], na.rm = TRUE)), digits = 0, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$SD_repl_firm_full <- format( sd( sample_cites[, get(var_name), ], na.rm = TRUE), digits = 0, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$MN_repl_comm_full <- format(abs(mean(sample_cites[ common_inventor_count > 0, get(var_name), ], na.rm = TRUE)), digits = 0, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$SD_repl_comm_full <- format( sd( sample_cites[ common_inventor_count > 0, get(var_name), ], na.rm = TRUE), digits = 0, big.mark = ',', nnormal = 2)
 
 tab_1_summary[i,]$MN_repl_firm_2002 <- format(abs(mean(sample_cites[citing_pat_year == 2002, get(var_name), ], na.rm = TRUE)), digits = 0, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$SD_repl_firm_2002 <- format( sd( sample_cites[citing_pat_year == 2002, get(var_name), ], na.rm = TRUE), digits = 0, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$MN_repl_comm_2002 <- format(abs(mean(sample_cites[citing_pat_year == 2002 & common_inventor_count > 0, get(var_name), ], na.rm = TRUE)), digits = 0, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$SD_repl_comm_2002 <- format( sd( sample_cites[citing_pat_year == 2002 & common_inventor_count > 0, get(var_name), ], na.rm = TRUE), digits = 0, big.mark = ',', nnormal = 2)
}
# Pass 3 (last three rows, the "additional variables"): only the full-period
# columns are filled; the 2002 columns are left as they were.
for (i in seq(nrow(tab_1_summary)-2, nrow(tab_1_summary))){
 var_name <- tab_1_summary[i,]$var_name
 tab_1_summary[i,]$MN_repl_firm_full <- format(abs(mean(sample_cites[, get(var_name), ], na.rm = TRUE)), digits = 2, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$SD_repl_firm_full <- format( sd( sample_cites[, get(var_name), ], na.rm = TRUE), digits = 2, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$MN_repl_comm_full <- format(abs(mean(sample_cites[common_inventor_count > 0, get(var_name), ], na.rm = TRUE)), digits = 2, big.mark = ',', nnormal = 2)
 tab_1_summary[i,]$SD_repl_comm_full <- format( sd( sample_cites[common_inventor_count > 0, get(var_name), ], na.rm = TRUE), digits = 2, big.mark = ',', nnormal = 2)
}
# Drop the lookup column and blank out every "NaN" / "NA" substring.
tab_1_summary[, var_name := NULL]
tab_1_summary <- data.table(apply(apply(tab_1_summary, 2, function(x) gsub("NaN", "", x)), 2, function(x) gsub("NA", "", x)))
# Observations row: two hard-coded counts ('126,340' and '40,085') sit in the
# MN_lamp_* columns; the other counts are computed from sample_cites.
tab_1_summary <- rbind(tab_1_summary, list('Observations', 
 '126,340', '',
 formatC(nrow(sample_cites[citing_pat_year == 2002,]), digits = 0, format = 'f', big.mark=','), '',
 formatC(nrow(sample_cites), digits = 0, format = 'f', big.mark=','), '',
 '40,085', '',
 formatC(nrow(sample_cites[citing_pat_year == 2002 & common_inventor_count > 0]), digits = 0, format = 'f', big.mark=','), '',
 formatC(nrow(sample_cites[common_inventor_count > 0]), digits = 0, format = 'f', big.mark=','), ''))
# Render as LaTeX. Text is passed through unsanitized so the \textit / \hspace
# markup in the Variable column survives; rules are drawn after the header and
# around the Observations row.
tab_1_xt <- xtable(tab_1_summary[, c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)], caption = 'Descriptive Statistics', align = 'cp{1.65in}rrrrrrrrrrrr', label = 'tab_1a')
names(tab_1_xt) <- c('Variable', 'Mean', 'S.D.', 'Mean', 'S.D.', 'Mean', 'S.D.', 'Mean', 'S.D.', 'Mean', 'S.D.', 'Mean', 'S.D.')
print.xtable(tab_1_xt, 
 hline.after = c(0, nrow(tab_1_summary)-1, nrow(tab_1_summary)),
 include.rownames = FALSE, sanitize.text.function = identity, size = 'footnotesize',
 tabular.environment = 'tabular*', width = "\\linewidth", 
 caption.placement = 'top', sanitize.colnames.function = identity, 
 file = file.path(PATH['Latex'], 'TABLE_1.tex'))

# Table 2: linear probability models of examiner citation on the (log,
# demeaned) number of previously cited patents. Each pool regressor is
# demeaned within the sample its model is fitted on.

# Columns 1-2: full sample, firm-level pool.
sample_cites[, citing_pool_size_firm_lg_dm := log(citing_pool_size_firm + 1) - mean(log(citing_pool_size_firm + 1))]
fit1 <- lm(data = sample_cites, is_exam_cite ~ citing_pool_size_firm_lg_dm)
fit2 <- lm(data = sample_cites, is_exam_cite ~ citing_pool_size_firm_lg_dm + citing_is_lawyer + citing_exam_time + citing_num_claims + cited_num_claims + citing_is_foreign)

# Columns 3-4: common-inventor subsample, firm-level pool re-demeaned within it.
sample_cites_common <- sample_cites[common_inventor_count > 0, ]
sample_cites_common[, citing_pool_size_common_lg_dm := log(citing_pool_size_common + 1) - mean(log(citing_pool_size_common + 1))]
sample_cites_common[, citing_pool_size_firm_lg_dm := log(citing_pool_size_firm + 1) - mean(log(citing_pool_size_firm + 1))]
fit3 <- lm(data = sample_cites_common, is_exam_cite ~ citing_pool_size_firm_lg_dm)
fit4 <- lm(data = sample_cites_common, is_exam_cite ~ citing_pool_size_firm_lg_dm + citing_is_lawyer + citing_exam_time + citing_num_claims + cited_num_claims + citing_is_foreign)
sample_cites_common[, citing_pool_size_common_lg := log(citing_pool_size_common + 1)]

# Columns 5-6: common-inventor citations that were used in a rejection.
# This subset is taken from sample_cites, which has no common-inventor pool
# regressor, so it must be created here before fitting; the original fitted
# fit5/fit6 on a column that did not exist in sample_cites_rej.
sample_cites_rej <- sample_cites[common_inventor_count > 0 & is_rej == 1, ]
sample_cites_rej[, citing_pool_size_common_lg_dm := log(citing_pool_size_common + 1) - mean(log(citing_pool_size_common + 1))]
sample_cites_rej[, citing_pool_size_firm_lg_dm := log(citing_pool_size_firm + 1) - mean(log(citing_pool_size_firm + 1))]
fit5 <- lm(data = sample_cites_rej, is_exam_cite ~ citing_pool_size_common_lg_dm)
fit6 <- lm(data = sample_cites_rej, is_exam_cite ~ citing_pool_size_common_lg_dm + citing_is_lawyer + citing_exam_time + citing_num_claims + cited_num_claims + citing_is_foreign)

# NOTE(review): fit5/fit6 use a differently named pool regressor than
# fit1-fit4, so stargazer will list it as a separate covariate while only six
# covariate.labels are supplied - confirm the labels in the rendered table.
stargazer(fit1, fit2, fit3, fit4, fit5, fit6, 
 title = 'Previous citations and examiner citation',
 header = FALSE,
 align = TRUE,
 no.space = TRUE,
 dep.var.labels.include = FALSE,
 dep.var.caption = "",
 column.labels = c('Full sample', 'Common inventor', 'Restricted'),
 column.separate = c(2, 2, 2),
 column.sep.width = '-35pt',
 covariate.labels = c('Previously-cited patents', 'Attorney or agent', 'Examination time', 'Citing claims', 'Cited claims', 'Non-U.S. firm'),
 omit.stat = c('adj.rsq', 'f', 'ser'),
 df = FALSE,
 font.size = 'small',
 omit.table.layout = "n",
 star.cutoffs = c(0.05, 0.01, 0.001),
 out = file.path(PATH['Latex'], 'TABLE_2.tex'))

# Figure 1a: smoothed share of examiner citations over the citing patent's
# grant date, for the full sample and for the common-inventor subsample.
#
# Only the two plotted columns are extracted. Selecting columns in j creates a
# new data.table, so tagging the rows with cite_type below does not modify
# sample_cites (a plain `<-` of a data.table is an alias, and `:=` on the alias
# would add the column to sample_cites by reference).
sample_cites_count1 <- sample_cites[, list(citing_pat_date, is_exam_cite)]
sample_cites_count2 <- sample_cites[common_inventor_count > 0, list(citing_pat_date, is_exam_cite)]
sample_cites_count1[, cite_type := 'Full sample']
sample_cites_count2[, cite_type := 'Common inventor subsample']
sample_cites_count <- rbind(sample_cites_count1, sample_cites_count2)
sample_cites_count[, cite_type := factor(cite_type, levels = c('Full sample', 'Common inventor subsample'))]
p <- ggplot(sample_cites_count, aes(x = citing_pat_date, y = 100*is_exam_cite, linetype = cite_type)) + theme_bw() + GG_theme +
 geom_smooth(color = 'black', se = FALSE) + 
 # Zoom without dropping observations from the smoother.
 coord_cartesian(ylim = c(0, 35), xlim = ymd(c('2002-01-01', '2014-04-01'))) + 
 xlab('Citing patent grant date') + 
 ylab('Percentage of relevant citations') + 
 scale_linetype_manual(breaks = c('Full sample', 'Common inventor subsample'), values = c('dotted', 'solid')) + 
 scale_x_date(expand = c(0,0), date_breaks = '2 years', date_labels = '%Y') + 
 scale_y_continuous(expand = c(0,0), breaks = seq(0, 40, 10)) + 
 theme(legend.position = c(.4,.2),
 legend.title = element_blank())
ggsave(filename = file.path(PATH['Figures'], 'Fig_1a.png'), plot = p, width = GG_size, height = GG_size, dpi = GG_dpi)

# Figure 1b: per grant year, the number of distinct citing patents in each
# sample as a percentage of all patents granted that year.

# Rejection flag: 1 when either is_102 or is_103 is 1.
sample_cites[, is_rej := pmax(is_102, is_103)]

# Distinct citing patents per grant year for each sample definition.
sample_cites_count1 <- sample_cites[is_exam_cite == 1,
                                    list(count = uniqueN(citing_pat_num)),
                                    by = 'citing_pat_year']
sample_cites_count2 <- sample_cites[is_exam_cite == 1 & common_inventor_count > 0,
                                    list(count = uniqueN(citing_pat_num)),
                                    by = 'citing_pat_year']
sample_cites_count4 <- sample_cites[is_exam_cite_believable == 1 & common_inventor_count > 0 & is_rej == 1,
                                    list(count = uniqueN(citing_pat_num)),
                                    by = 'citing_pat_year']
sample_cites_count1[, cite_type := 'Full sample']
sample_cites_count2[, cite_type := 'Common inventor subsample']
sample_cites_count4[, cite_type := 'Restricted subsample']
sample_cites_count <- rbind(sample_cites_count1, sample_cites_count2, sample_cites_count4)

# Attach the yearly patent totals and convert the counts to percentages.
sample_cites_count <- merge(sample_cites_count,
                            patents[, list(pat_count = .N), by = 'pat_year'],
                            by.x = 'citing_pat_year', by.y = 'pat_year')
sample_cites_count[, count_per := 100*count/pat_count]
sample_cites_count[, cite_type := factor(cite_type, levels = c('Full sample', 'Common inventor subsample', 'Restricted subsample'))]

# Only grant years from 2005 onwards are plotted.
sample_cites_count <- sample_cites_count[citing_pat_year >= 2005]

p <- ggplot(sample_cites_count, aes(x = citing_pat_year, y = count_per, linetype = cite_type)) +
  theme_bw() + GG_theme +
  geom_point(size = .5) +
  geom_line() +
  coord_cartesian(xlim = c(2002, 2014.3), ylim = c(0, 35)) +
  xlab('Citing patent grant date') +
  ylab('Percentage of all patents') +
  # Four line types are supplied; cite_type has three levels.
  scale_linetype_manual(values = c('dotted', 'dashed', 'longdash', 'solid')) +
  scale_x_continuous(expand = c(0,0), breaks = seq(2002, 2014, 2)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.title = element_blank(), legend.position = c(.7, .8))
ggsave(filename = file.path(PATH['Figures'], 'Fig_1b.png'), plot = p, width = GG_size, height = GG_size, dpi = GG_dpi)

# Figure 2: density (log10 x-axis) of the per-citing-patent mean pool size,
# for the firm pool and for the common-inventor pool.

# One row per citing patent: mean firm pool size over its citations.
sample_cites_tmp1 <- sample_cites[, list(citing_pool_size = mean(citing_pool_size_firm, na.rm = TRUE)),
                                  by = 'citing_pat_num']
# Same for the common-inventor pool, restricted to rows that have a common
# inventor and a non-missing common pool size.
sample_cites_tmp2 <- sample_cites[common_inventor_count > 0 & !is.na(citing_pool_size_common),
                                  list(citing_pool_size = mean(citing_pool_size_common, na.rm = TRUE)),
                                  by = 'citing_pat_num']
sample_cites_tmp1[, sample_label := 'Cited by the firm']
sample_cites_tmp2[, sample_label := 'Cited by a common inventor']
sample_cites_tmp <- rbind(sample_cites_tmp1, sample_cites_tmp2)

p <- ggplot(sample_cites_tmp, aes(x = citing_pool_size)) +
  theme_bw() + GG_theme +
  geom_density(aes(linetype = sample_label)) +
  xlab('Previously cited patents') +
  ylab('Density') +
  coord_cartesian(xlim = c(1, 300000)) +
  scale_x_log10(expand = c(0,0), breaks = c(1, 10, 100, 1000, 10000, 100000)) +
  scale_y_continuous(expand = c(0,0)) +
  # The y axis carries no tick marks or tick labels.
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        legend.position = c(.72,.9),
        legend.title = element_blank())
ggsave(filename = file.path(PATH['Figures'], 'Fig_2.png'), plot = p, width = GG_size, height = GG_size, dpi = GG_dpi)

# Figure 3: Venn diagrams of the overlap between three subsets of sample_cites
# (common inventor, rejections, common-inventor pool size below 100).
#   A: citation ids, all rows
#   B: citation ids, rows with is_exam_cite_believable == 1
#   C: distinct citing patents, all rows
#   D: distinct citing patents, rows with is_exam_cite_believable == 1
Fig_3_set_names <- c('Common inventor subsample',
 'Rejections', 
 'Pool size inventor < 100')
Fig_3_set1 <- sample_cites[common_inventor_count > 0, ]
Fig_3_set2 <- sample_cites[is_rej == 1 , ]
Fig_3_set3 <- sample_cites[citing_pool_size_common < 100, ]

# Build every list from the same three sets with the same extraction, so all
# three circles of a diagram are filtered identically. (Previously the third
# set in Fig_3D_list was not restricted to is_exam_cite_believable == 1.)
Fig_3_sets <- list(Fig_3_set1, Fig_3_set2, Fig_3_set3)
Fig_3A_list <- lapply(Fig_3_sets, function(dt) dt$cite_id)
Fig_3B_list <- lapply(Fig_3_sets, function(dt) dt[is_exam_cite_believable == 1, ]$cite_id)
Fig_3C_list <- lapply(Fig_3_sets, function(dt) unique(dt$citing_pat_num))
Fig_3D_list <- lapply(Fig_3_sets, function(dt) unique(dt[is_exam_cite_believable == 1, ]$citing_pat_num))

# NOTE(review): these file names are relative to the working directory, unlike
# the ggsave() outputs, which go to PATH['Figures'] -- confirm this is intended.
venn.diagram(x = Fig_3A_list, category.names = Fig_3_set_names, filename = 'Fig_3A_cite_Relevant.png')
venn.diagram(x = Fig_3B_list, category.names = Fig_3_set_names, filename = 'Fig_3B_cite_Withheld.png')
venn.diagram(x = Fig_3C_list, category.names = Fig_3_set_names, filename = 'Fig_3C_pat_Relevant.png')
venn.diagram(x = Fig_3D_list, category.names = Fig_3_set_names, filename = 'Fig_3D_pat_Withheld.png')

